In [16]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as seabornInstance

from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn import metrics
%matplotlib inline
In [17]:
data = pd.read_csv("IMR4_state_IMR.csv")
In [18]:
data.head()
Out[18]:
Category Country/ State/ UT Name Infant mortality rate - 1971 Infant mortality rate - 1972 Infant mortality rate - 1973 Infant mortality rate - 1974 Infant mortality rate - 1975 Infant mortality rate - 1976 Infant mortality rate - 1977 Infant mortality rate - 1978 ... Infant mortality rate - 2003 Infant mortality rate - 2004 Infant mortality rate - 2005 Infant mortality rate - 2006 Infant mortality rate - 2007 Infant mortality rate - 2008 Infant mortality rate - 2009 Infant mortality rate - 2010 Infant mortality rate - 2011 Infant mortality rate - 2012
0 Country India (Average) 129.0 139.0 134.0 126.0 140.0 129.0 130.0 127.0 ... 60.0 58 58 57 55 53 50 47 44 42
1 State Andhra Pradesh 106.0 116.0 105.0 111.0 123.0 122.0 125.0 117.0 ... 59.0 59 57 56 54 52 49 46 43 41
2 State Assam 139.0 136.0 136.0 136.0 144.0 124.0 115.0 118.0 ... 67.0 66 68 67 66 64 61 58 55 55
3 State Bihar NaN NaN NaN NaN NaN NaN NaN NaN ... 60.0 61 61 60 58 56 52 48 44 43
4 State Chhattisgarh NaN NaN NaN NaN NaN NaN NaN NaN ... 70.0 60 63 61 59 57 54 51 48 47

5 rows × 44 columns

In [19]:
data.tail()
Out[19]:
Category Country/ State/ UT Name Infant mortality rate - 1971 Infant mortality rate - 1972 Infant mortality rate - 1973 Infant mortality rate - 1974 Infant mortality rate - 1975 Infant mortality rate - 1976 Infant mortality rate - 1977 Infant mortality rate - 1978 ... Infant mortality rate - 2003 Infant mortality rate - 2004 Infant mortality rate - 2005 Infant mortality rate - 2006 Infant mortality rate - 2007 Infant mortality rate - 2008 Infant mortality rate - 2009 Infant mortality rate - 2010 Infant mortality rate - 2011 Infant mortality rate - 2012
31 Union Territory Dadra and Nagar Haveli NaN NaN NaN NaN NaN NaN NaN NaN ... 54.0 48 42 35 34 34 37 38 35 33
32 Union Territory Daman and Diu NaN NaN NaN NaN NaN NaN NaN NaN ... 39.0 37 28 28 27 31 24 23 22 22
33 Union Territory Delhi NaN NaN NaN NaN NaN NaN NaN NaN ... 28.0 32 35 37 36 35 33 30 28 25
34 Union Territory Lakshadweep NaN NaN NaN NaN NaN NaN NaN NaN ... 26.0 30 22 25 24 31 25 25 24 24
35 Union Territory Puducherry NaN NaN NaN NaN NaN NaN NaN NaN ... 24.0 24 28 28 25 25 22 22 19 17

5 rows × 44 columns

In [20]:
data.shape
Out[20]:
(36, 44)
In [21]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36 entries, 0 to 35
Data columns (total 44 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Category                      36 non-null     object 
 1   Country/ State/ UT Name       36 non-null     object 
 2   Infant mortality rate - 1971  13 non-null     float64
 3   Infant mortality rate - 1972  14 non-null     float64
 4   Infant mortality rate - 1973  16 non-null     float64
 5   Infant mortality rate - 1974  16 non-null     float64
 6   Infant mortality rate - 1975  16 non-null     float64
 7   Infant mortality rate - 1976  16 non-null     float64
 8   Infant mortality rate - 1977  16 non-null     float64
 9   Infant mortality rate - 1978  16 non-null     float64
 10  Infant mortality rate - 1979  16 non-null     float64
 11  Infant mortality rate - 1980  16 non-null     float64
 12  Infant mortality rate - 1981  18 non-null     float64
 13  Infant mortality rate - 1982  18 non-null     float64
 14  Infant mortality rate - 1983  18 non-null     float64
 15  Infant mortality rate - 1984  18 non-null     float64
 16  Infant mortality rate - 1985  18 non-null     float64
 17  Infant mortality rate - 1986  18 non-null     float64
 18  Infant mortality rate - 1987  18 non-null     float64
 19  Infant mortality rate - 1988  18 non-null     float64
 20  Infant mortality rate - 1989  18 non-null     float64
 21  Infant mortality rate - 1990  18 non-null     float64
 22  Infant mortality rate - 1991  17 non-null     float64
 23  Infant mortality rate - 1992  17 non-null     float64
 24  Infant mortality rate - 1993  17 non-null     float64
 25  Infant mortality rate - 1994  17 non-null     float64
 26  Infant mortality rate - 1995  17 non-null     float64
 27  Infant mortality rate - 1996  17 non-null     float64
 28  Infant mortality rate - 1997  33 non-null     float64
 29  Infant mortality rate - 1998  34 non-null     float64
 30  Infant mortality rate - 1999  35 non-null     float64
 31  Infant mortality rate - 2000  35 non-null     float64
 32  Infant mortality rate - 2001  35 non-null     float64
 33  Infant mortality rate - 2002  35 non-null     float64
 34  Infant mortality rate - 2003  35 non-null     float64
 35  Infant mortality rate - 2004  36 non-null     int64  
 36  Infant mortality rate - 2005  36 non-null     int64  
 37  Infant mortality rate - 2006  36 non-null     int64  
 38  Infant mortality rate - 2007  36 non-null     int64  
 39  Infant mortality rate - 2008  36 non-null     int64  
 40  Infant mortality rate - 2009  36 non-null     int64  
 41  Infant mortality rate - 2010  36 non-null     int64  
 42  Infant mortality rate - 2011  36 non-null     int64  
 43  Infant mortality rate - 2012  36 non-null     int64  
dtypes: float64(33), int64(9), object(2)
memory usage: 12.5+ KB
In [22]:
data.isnull().sum()
Out[22]:
Category                         0
Country/ State/ UT Name          0
Infant mortality rate - 1971    23
Infant mortality rate - 1972    22
Infant mortality rate - 1973    20
Infant mortality rate - 1974    20
Infant mortality rate - 1975    20
Infant mortality rate - 1976    20
Infant mortality rate - 1977    20
Infant mortality rate - 1978    20
Infant mortality rate - 1979    20
Infant mortality rate - 1980    20
Infant mortality rate - 1981    18
Infant mortality rate - 1982    18
Infant mortality rate - 1983    18
Infant mortality rate - 1984    18
Infant mortality rate - 1985    18
Infant mortality rate - 1986    18
Infant mortality rate - 1987    18
Infant mortality rate - 1988    18
Infant mortality rate - 1989    18
Infant mortality rate - 1990    18
Infant mortality rate - 1991    19
Infant mortality rate - 1992    19
Infant mortality rate - 1993    19
Infant mortality rate - 1994    19
Infant mortality rate - 1995    19
Infant mortality rate - 1996    19
Infant mortality rate - 1997     3
Infant mortality rate - 1998     2
Infant mortality rate - 1999     1
Infant mortality rate - 2000     1
Infant mortality rate - 2001     1
Infant mortality rate - 2002     1
Infant mortality rate - 2003     1
Infant mortality rate - 2004     0
Infant mortality rate - 2005     0
Infant mortality rate - 2006     0
Infant mortality rate - 2007     0
Infant mortality rate - 2008     0
Infant mortality rate - 2009     0
Infant mortality rate - 2010     0
Infant mortality rate - 2011     0
Infant mortality rate - 2012     0
dtype: int64
In [23]:
data.isnull().sum()/data.shape[0]*100
Out[23]:
Category                         0.000000
Country/ State/ UT Name          0.000000
Infant mortality rate - 1971    63.888889
Infant mortality rate - 1972    61.111111
Infant mortality rate - 1973    55.555556
Infant mortality rate - 1974    55.555556
Infant mortality rate - 1975    55.555556
Infant mortality rate - 1976    55.555556
Infant mortality rate - 1977    55.555556
Infant mortality rate - 1978    55.555556
Infant mortality rate - 1979    55.555556
Infant mortality rate - 1980    55.555556
Infant mortality rate - 1981    50.000000
Infant mortality rate - 1982    50.000000
Infant mortality rate - 1983    50.000000
Infant mortality rate - 1984    50.000000
Infant mortality rate - 1985    50.000000
Infant mortality rate - 1986    50.000000
Infant mortality rate - 1987    50.000000
Infant mortality rate - 1988    50.000000
Infant mortality rate - 1989    50.000000
Infant mortality rate - 1990    50.000000
Infant mortality rate - 1991    52.777778
Infant mortality rate - 1992    52.777778
Infant mortality rate - 1993    52.777778
Infant mortality rate - 1994    52.777778
Infant mortality rate - 1995    52.777778
Infant mortality rate - 1996    52.777778
Infant mortality rate - 1997     8.333333
Infant mortality rate - 1998     5.555556
Infant mortality rate - 1999     2.777778
Infant mortality rate - 2000     2.777778
Infant mortality rate - 2001     2.777778
Infant mortality rate - 2002     2.777778
Infant mortality rate - 2003     2.777778
Infant mortality rate - 2004     0.000000
Infant mortality rate - 2005     0.000000
Infant mortality rate - 2006     0.000000
Infant mortality rate - 2007     0.000000
Infant mortality rate - 2008     0.000000
Infant mortality rate - 2009     0.000000
Infant mortality rate - 2010     0.000000
Infant mortality rate - 2011     0.000000
Infant mortality rate - 2012     0.000000
dtype: float64
In [24]:
data.duplicated().sum()
Out[24]:
0
In [25]:
for i in data.select_dtypes(include="object").columns:
    print(data[i].value_counts())
    print("***"*10)
State              28
Union Territory     7
Country             1
Name: Category, dtype: int64
******************************
India (Average)                1
Andhra Pradesh                 1
Himachal Pradesh               1
Jammu and Kashmir              1
Manipur                        1
Meghalaya                      1
Mizoram                        1
Nagaland                       1
Sikkim                         1
Tripura                        1
Uttarakhand                    1
Andaman and Nicobar Islands    1
Chandigarh                     1
Dadra and Nagar Haveli         1
Daman and Diu                  1
Delhi                          1
Lakshadweep                    1
Goa                            1
Arunachal Pradesh              1
West Bengal                    1
Karnataka                      1
Assam                          1
Bihar                          1
Chhattisgarh                   1
Gujarat                        1
Haryana                        1
Jharkhand                      1
Kerala                         1
Uttar Pradesh                  1
Madhya Pradesh                 1
Maharashtra                    1
Odisha                         1
Punjab                         1
Rajasthan                      1
Tamil Nadu                     1
Puducherry                     1
Name: Country/ State/ UT Name, dtype: int64
******************************
In [26]:
data.describe().T
Out[26]:
count mean std min 25% 50% 75% max
Infant mortality rate - 1971 13.0 114.769231 29.877957 58.0 102.00 113.0 135.00 167.0
Infant mortality rate - 1972 14.0 123.142857 32.370112 63.0 104.75 122.0 134.75 202.0
Infant mortality rate - 1973 16.0 117.500000 33.033317 58.0 100.50 115.5 139.00 176.0
Infant mortality rate - 1974 16.0 111.312500 29.756162 54.0 95.00 106.0 133.75 172.0
Infant mortality rate - 1975 16.0 121.875000 37.618923 54.0 96.50 120.5 149.50 198.0
Infant mortality rate - 1976 16.0 116.187500 30.824706 56.0 103.25 123.0 131.25 178.0
Infant mortality rate - 1977 16.0 114.687500 32.097183 47.0 102.50 114.0 139.00 168.0
Infant mortality rate - 1978 16.0 111.687500 31.918059 42.0 96.25 117.0 128.50 177.0
Infant mortality rate - 1979 16.0 105.125000 29.734099 43.0 86.75 102.0 120.75 162.0
Infant mortality rate - 1980 16.0 100.062500 30.281939 40.0 84.00 98.0 113.25 159.0
Infant mortality rate - 1981 18.0 97.944444 28.603727 37.0 79.50 96.0 114.50 150.0
Infant mortality rate - 1982 18.0 92.055556 28.930896 30.0 71.25 89.5 109.50 147.0
Infant mortality rate - 1983 18.0 92.888889 26.614655 33.0 79.25 89.0 105.75 155.0
Infant mortality rate - 1984 18.0 93.611111 28.263755 29.0 78.00 92.5 105.50 155.0
Infant mortality rate - 1985 18.0 91.500000 26.373449 31.0 75.75 85.0 107.50 142.0
Infant mortality rate - 1986 18.0 89.500000 25.245267 27.0 74.75 86.5 107.00 132.0
Infant mortality rate - 1987 18.0 87.055556 24.773022 28.0 72.00 84.5 101.75 127.0
Infant mortality rate - 1988 18.0 86.055556 23.949634 28.0 71.75 86.5 98.50 124.0
Infant mortality rate - 1989 18.0 82.444444 23.761616 21.0 69.75 81.5 91.00 121.0
Infant mortality rate - 1990 18.0 73.611111 22.437204 17.0 64.50 70.0 79.00 122.0
Infant mortality rate - 1991 17.0 74.470588 24.212904 16.0 68.00 73.0 80.00 124.0
Infant mortality rate - 1992 17.0 73.235294 21.870441 17.0 65.00 73.0 79.00 115.0
Infant mortality rate - 1993 17.0 68.823529 22.589365 13.0 58.00 66.0 81.00 110.0
Infant mortality rate - 1994 17.0 68.352941 19.751396 16.0 59.00 67.0 78.00 103.0
Infant mortality rate - 1995 17.0 68.058824 20.225203 15.0 58.00 67.0 77.00 103.0
Infant mortality rate - 1996 17.0 65.294118 19.986760 14.0 53.00 65.0 74.00 97.0
Infant mortality rate - 1997 33.0 54.545455 22.207953 12.0 38.00 53.0 68.00 96.0
Infant mortality rate - 1998 34.0 54.588235 22.548423 16.0 38.00 53.0 67.75 98.0
Infant mortality rate - 1999 35.0 52.171429 21.557701 14.0 33.50 52.0 67.00 97.0
Infant mortality rate - 2000 35.0 51.828571 20.933348 14.0 36.50 51.0 66.00 95.0
Infant mortality rate - 2001 35.0 49.600000 21.445142 11.0 36.00 49.0 64.00 91.0
Infant mortality rate - 2002 35.0 47.285714 21.693821 10.0 32.00 49.0 61.50 87.0
Infant mortality rate - 2003 35.0 45.200000 20.130165 11.0 30.00 45.0 59.00 83.0
Infant mortality rate - 2004 36.0 43.361111 18.547472 12.0 31.50 43.5 58.25 79.0
Infant mortality rate - 2005 36.0 42.722222 18.444683 13.0 28.00 42.0 57.25 76.0
Infant mortality rate - 2006 36.0 42.972222 17.327907 11.0 30.25 41.5 56.25 74.0
Infant mortality rate - 2007 36.0 42.250000 16.858021 12.0 32.25 41.0 55.00 72.0
Infant mortality rate - 2008 36.0 41.500000 15.756178 10.0 31.00 39.0 53.25 70.0
Infant mortality rate - 2009 36.0 39.305556 15.053845 11.0 27.75 37.5 50.25 67.0
Infant mortality rate - 2010 36.0 36.777778 14.241009 10.0 25.00 37.5 47.25 62.0
Infant mortality rate - 2011 36.0 34.472222 13.476935 11.0 23.75 34.5 44.00 59.0
Infant mortality rate - 2012 36.0 33.000000 12.855238 10.0 24.00 33.0 42.00 56.0
In [27]:
data.describe(include="object")
Out[27]:
Category Country/ State/ UT Name
count 36 36
unique 3 36
top State India (Average)
freq 28 1
In [28]:
import warnings
warnings.filterwarnings("ignore")
sns.distplot(data['Infant mortality rate - 1971'])
Out[28]:
<AxesSubplot:xlabel='Infant mortality rate - 1971', ylabel='Density'>
In [29]:
#Deviate from the normal distribution.
#Have appreciable positive skewness.
#Show peakedness.
print('Skewness: %f' % data['Infant mortality rate - 1971'].skew())
print('Kurtsis: %f' %data['Infant mortality rate - 1971'].kurt())
Skewness: -0.287734
Kurtsis: -0.004168
In [30]:
import warnings
warnings.filterwarnings("ignore")
for i in data.select_dtypes(include="number").columns:
    sns.histplot(data=data,x=i)
    plt.show()
In [31]:
import warnings
warnings.filterwarnings("ignore")
for i in data.select_dtypes(include="number").columns:
    sns.boxplot(data=data,x=i)
    plt.show()
In [32]:
import warnings
warnings.filterwarnings("ignore")
for i in data.select_dtypes(include="number").columns:
    plt.figure(figsize=(14,5))
    sns.kdeplot(data=data,x=i)
    plt.show()
In [33]:
for i in ['Infant mortality rate - 1971', 'Infant mortality rate - 1972', 'Infant mortality rate - 1974',
       'Infant mortality rate - 1975', 'Infant mortality rate - 1976',
       'Infant mortality rate - 1977', 'Infant mortality rate - 1978',
       'Infant mortality rate - 1979', 'Infant mortality rate - 1980',
       'Infant mortality rate - 1981', 'Infant mortality rate - 1982',
       'Infant mortality rate - 1983', 'Infant mortality rate - 1984',
       'Infant mortality rate - 1985', 'Infant mortality rate - 1986',
       'Infant mortality rate - 1987', 'Infant mortality rate - 1988',
       'Infant mortality rate - 1989', 'Infant mortality rate - 1990',
       'Infant mortality rate - 1991', 'Infant mortality rate - 1992',
       'Infant mortality rate - 1993', 'Infant mortality rate - 1994',
       'Infant mortality rate - 1995', 'Infant mortality rate - 1996',
       'Infant mortality rate - 1997', 'Infant mortality rate - 1998',
       'Infant mortality rate - 1999', 'Infant mortality rate - 2000',
       'Infant mortality rate - 2001', 'Infant mortality rate - 2002',
       'Infant mortality rate - 2003', 'Infant mortality rate - 2004',
       'Infant mortality rate - 2005', 'Infant mortality rate - 2006',
       'Infant mortality rate - 2007', 'Infant mortality rate - 2008',
       'Infant mortality rate - 2009', 'Infant mortality rate - 2010',
       'Infant mortality rate - 2011', 'Infant mortality rate - 2012']:
    sns.scatterplot(data=data,x=i,y='Infant mortality rate - 1973')
    plt.show()
In [34]:
data.select_dtypes(include="number").columns
Out[34]:
Index(['Infant mortality rate - 1971', 'Infant mortality rate - 1972',
       'Infant mortality rate - 1973', 'Infant mortality rate - 1974',
       'Infant mortality rate - 1975', 'Infant mortality rate - 1976',
       'Infant mortality rate - 1977', 'Infant mortality rate - 1978',
       'Infant mortality rate - 1979', 'Infant mortality rate - 1980',
       'Infant mortality rate - 1981', 'Infant mortality rate - 1982',
       'Infant mortality rate - 1983', 'Infant mortality rate - 1984',
       'Infant mortality rate - 1985', 'Infant mortality rate - 1986',
       'Infant mortality rate - 1987', 'Infant mortality rate - 1988',
       'Infant mortality rate - 1989', 'Infant mortality rate - 1990',
       'Infant mortality rate - 1991', 'Infant mortality rate - 1992',
       'Infant mortality rate - 1993', 'Infant mortality rate - 1994',
       'Infant mortality rate - 1995', 'Infant mortality rate - 1996',
       'Infant mortality rate - 1997', 'Infant mortality rate - 1998',
       'Infant mortality rate - 1999', 'Infant mortality rate - 2000',
       'Infant mortality rate - 2001', 'Infant mortality rate - 2002',
       'Infant mortality rate - 2003', 'Infant mortality rate - 2004',
       'Infant mortality rate - 2005', 'Infant mortality rate - 2006',
       'Infant mortality rate - 2007', 'Infant mortality rate - 2008',
       'Infant mortality rate - 2009', 'Infant mortality rate - 2010',
       'Infant mortality rate - 2011', 'Infant mortality rate - 2012'],
      dtype='object')
In [35]:
s=data.select_dtypes(include="number").corr()
In [36]:
plt.figure(figsize=(30,30))
sns.heatmap(s,annot=True)
Out[36]:
<AxesSubplot:>
In [37]:
corr = data.corr()
corr.style.background_gradient(cmap='coolwarm')
Out[37]:
  Infant mortality rate - 1971 Infant mortality rate - 1972 Infant mortality rate - 1973 Infant mortality rate - 1974 Infant mortality rate - 1975 Infant mortality rate - 1976 Infant mortality rate - 1977 Infant mortality rate - 1978 Infant mortality rate - 1979 Infant mortality rate - 1980 Infant mortality rate - 1981 Infant mortality rate - 1982 Infant mortality rate - 1983 Infant mortality rate - 1984 Infant mortality rate - 1985 Infant mortality rate - 1986 Infant mortality rate - 1987 Infant mortality rate - 1988 Infant mortality rate - 1989 Infant mortality rate - 1990 Infant mortality rate - 1991 Infant mortality rate - 1992 Infant mortality rate - 1993 Infant mortality rate - 1994 Infant mortality rate - 1995 Infant mortality rate - 1996 Infant mortality rate - 1997 Infant mortality rate - 1998 Infant mortality rate - 1999 Infant mortality rate - 2000 Infant mortality rate - 2001 Infant mortality rate - 2002 Infant mortality rate - 2003 Infant mortality rate - 2004 Infant mortality rate - 2005 Infant mortality rate - 2006 Infant mortality rate - 2007 Infant mortality rate - 2008 Infant mortality rate - 2009 Infant mortality rate - 2010 Infant mortality rate - 2011 Infant mortality rate - 2012
Infant mortality rate - 1971 1.000000 0.919844 0.938753 0.849215 0.903520 0.875915 0.844169 0.838585 0.842257 0.809169 0.830918 0.842462 0.852202 0.806424 0.852962 0.867119 0.814594 0.780232 0.766535 0.703166 0.673207 0.670230 0.694482 0.698468 0.688100 0.711382 0.722708 0.697900 0.709734 0.712501 0.709655 0.726023 0.725225 0.681149 0.697323 0.712676 0.717685 0.703589 0.703417 0.693944 0.697818 0.690637
Infant mortality rate - 1972 0.919844 1.000000 0.885500 0.905214 0.902494 0.899978 0.858821 0.918628 0.892320 0.890539 0.878405 0.886080 0.910906 0.837379 0.876398 0.853427 0.823560 0.811308 0.799212 0.732461 0.691681 0.705637 0.740356 0.716630 0.707883 0.742343 0.742170 0.728297 0.728588 0.730600 0.732776 0.739112 0.740161 0.728561 0.730167 0.740032 0.738995 0.727175 0.725021 0.724636 0.722968 0.711218
Infant mortality rate - 1973 0.938753 0.885500 1.000000 0.867360 0.922041 0.870094 0.935448 0.888091 0.897630 0.879025 0.910611 0.903980 0.901764 0.835858 0.819330 0.826992 0.829224 0.808988 0.792523 0.700662 0.680676 0.725438 0.715766 0.756589 0.740056 0.753637 0.763347 0.754122 0.768024 0.779673 0.781250 0.758101 0.790324 0.703641 0.713404 0.695464 0.699770 0.698895 0.696566 0.687523 0.672007 0.661163
Infant mortality rate - 1974 0.849215 0.905214 0.867360 1.000000 0.941439 0.883612 0.914509 0.942875 0.928553 0.937968 0.923830 0.927008 0.938717 0.921458 0.945638 0.919250 0.932671 0.932234 0.914157 0.843153 0.832221 0.876743 0.894108 0.889398 0.886411 0.904697 0.910126 0.895432 0.900775 0.904929 0.895114 0.891124 0.892945 0.868770 0.867649 0.861163 0.855334 0.848303 0.847587 0.846268 0.838983 0.832381
Infant mortality rate - 1975 0.903520 0.902494 0.922041 0.941439 1.000000 0.972605 0.951162 0.951114 0.911659 0.917397 0.912257 0.911001 0.927014 0.915183 0.896045 0.909653 0.898038 0.886264 0.855069 0.743084 0.731176 0.786303 0.782339 0.797520 0.800231 0.845783 0.852505 0.838161 0.821795 0.825411 0.813840 0.838796 0.822564 0.801427 0.805409 0.798265 0.791353 0.784438 0.795617 0.789349 0.783962 0.773737
Infant mortality rate - 1976 0.875915 0.899978 0.870094 0.883612 0.972605 1.000000 0.925620 0.951549 0.869470 0.875608 0.847762 0.848808 0.882254 0.866717 0.830491 0.865315 0.841512 0.829782 0.815718 0.690130 0.681232 0.741028 0.733834 0.739887 0.748013 0.804359 0.810586 0.806248 0.767658 0.767964 0.757916 0.813477 0.769578 0.766545 0.762633 0.756466 0.747302 0.736278 0.751757 0.735622 0.728340 0.709942
Infant mortality rate - 1977 0.844169 0.858821 0.935448 0.914509 0.951162 0.925620 1.000000 0.949909 0.939920 0.925295 0.925697 0.913519 0.927093 0.893772 0.859713 0.864564 0.883801 0.888497 0.872950 0.794694 0.800614 0.858327 0.828957 0.860156 0.867650 0.891360 0.889848 0.872181 0.859598 0.865333 0.859111 0.869676 0.867472 0.816960 0.806140 0.789402 0.784373 0.781542 0.785576 0.769280 0.755040 0.739052
Infant mortality rate - 1978 0.838585 0.918628 0.888091 0.942875 0.951114 0.951549 0.949909 1.000000 0.927142 0.935523 0.916592 0.912307 0.939111 0.901021 0.892228 0.890719 0.883011 0.886787 0.881087 0.792109 0.750683 0.821471 0.822287 0.828898 0.833210 0.871898 0.873136 0.863996 0.860439 0.862679 0.865072 0.873139 0.872000 0.847042 0.839442 0.832620 0.827329 0.819264 0.818217 0.805993 0.793376 0.775312
Infant mortality rate - 1979 0.842257 0.892320 0.897630 0.928553 0.911659 0.869470 0.939920 0.927142 1.000000 0.987767 0.976586 0.981289 0.969964 0.922647 0.931564 0.919916 0.941061 0.934841 0.935131 0.890624 0.864598 0.883436 0.874114 0.883571 0.871360 0.882148 0.881054 0.882901 0.893384 0.896761 0.884366 0.873839 0.888147 0.858661 0.844709 0.833025 0.829629 0.823803 0.820653 0.811903 0.800510 0.778092
Infant mortality rate - 1980 0.809169 0.890539 0.879025 0.937968 0.917397 0.875608 0.925295 0.935523 0.987767 1.000000 0.983780 0.988316 0.977641 0.939589 0.946278 0.930581 0.949657 0.944791 0.934686 0.889658 0.857750 0.882898 0.884653 0.884365 0.877350 0.898738 0.898384 0.897992 0.897565 0.898716 0.881374 0.882487 0.888853 0.875310 0.862330 0.852908 0.847524 0.842323 0.843715 0.836464 0.826309 0.807073
Infant mortality rate - 1981 0.830918 0.878405 0.910611 0.923830 0.912257 0.847762 0.925697 0.916592 0.976586 0.983780 1.000000 0.996874 0.967250 0.924109 0.939495 0.924702 0.947024 0.940170 0.926006 0.865928 0.817489 0.859760 0.860364 0.876971 0.873681 0.891957 0.892605 0.872968 0.886019 0.890668 0.887230 0.861863 0.898778 0.863148 0.854254 0.843092 0.843017 0.840504 0.835000 0.830042 0.825379 0.816422
Infant mortality rate - 1982 0.842462 0.886080 0.903980 0.927008 0.911001 0.848808 0.913519 0.912307 0.981289 0.988316 0.996874 1.000000 0.969010 0.927311 0.948607 0.934460 0.953459 0.942008 0.933684 0.875236 0.829204 0.862658 0.866245 0.876356 0.871436 0.887418 0.890071 0.871690 0.885258 0.889378 0.882817 0.863111 0.893608 0.865173 0.857774 0.848249 0.848456 0.845345 0.841052 0.837445 0.833799 0.824043
Infant mortality rate - 1983 0.852202 0.910906 0.901764 0.938717 0.927014 0.882254 0.927093 0.939111 0.969964 0.977641 0.967250 0.969010 1.000000 0.970231 0.946563 0.932307 0.942685 0.936150 0.924096 0.871601 0.820401 0.873967 0.864313 0.873996 0.864557 0.881777 0.884728 0.871404 0.878973 0.881713 0.870834 0.872961 0.880729 0.844219 0.838367 0.830281 0.828555 0.823459 0.820541 0.815091 0.809820 0.790432
Infant mortality rate - 1984 0.806424 0.837379 0.835858 0.921458 0.915183 0.866717 0.893772 0.901021 0.922647 0.939589 0.924109 0.927311 0.970231 1.000000 0.954189 0.952313 0.958444 0.957417 0.935276 0.885496 0.841572 0.907061 0.883151 0.899094 0.894165 0.912297 0.918162 0.899985 0.907046 0.907158 0.889253 0.910112 0.897581 0.883908 0.886575 0.881142 0.877715 0.874693 0.876846 0.875291 0.874706 0.856027
Infant mortality rate - 1985 0.852962 0.876398 0.819330 0.945638 0.896045 0.830491 0.859713 0.892228 0.931564 0.946278 0.939495 0.948607 0.946563 0.954189 1.000000 0.985671 0.982133 0.974829 0.950111 0.917971 0.867024 0.909705 0.922986 0.915784 0.923191 0.938408 0.945204 0.909646 0.927503 0.927088 0.910024 0.917903 0.922292 0.926812 0.928047 0.931230 0.929464 0.922969 0.921303 0.918634 0.916384 0.906722
Infant mortality rate - 1986 0.867119 0.853427 0.826992 0.919250 0.909653 0.865315 0.864564 0.890719 0.919916 0.930581 0.924702 0.934460 0.932307 0.952313 0.985671 1.000000 0.985108 0.973250 0.951974 0.906238 0.860084 0.901509 0.909630 0.910853 0.916209 0.935823 0.945193 0.919020 0.928218 0.926061 0.907243 0.930735 0.920916 0.933803 0.935680 0.938567 0.937091 0.928527 0.930964 0.921622 0.919783 0.907303
Infant mortality rate - 1987 0.814594 0.823560 0.829224 0.932671 0.898038 0.841512 0.883801 0.883011 0.941061 0.949657 0.947024 0.953459 0.942685 0.958444 0.982133 0.985108 1.000000 0.994325 0.978071 0.936519 0.904702 0.940929 0.943483 0.944935 0.949041 0.957757 0.963164 0.949195 0.951569 0.950833 0.932359 0.947650 0.943490 0.946186 0.943086 0.938375 0.935602 0.929941 0.930999 0.922586 0.919437 0.907500
Infant mortality rate - 1988 0.780232 0.811308 0.808988 0.932234 0.886264 0.829782 0.888497 0.886787 0.934841 0.944791 0.940170 0.942008 0.936150 0.957417 0.974829 0.973250 0.994325 1.000000 0.980377 0.943977 0.913937 0.956123 0.955719 0.959170 0.965941 0.974802 0.976211 0.964286 0.967550 0.966657 0.951034 0.962267 0.960213 0.964411 0.958893 0.952729 0.949252 0.945121 0.945291 0.937467 0.933329 0.921859
Infant mortality rate - 1989 0.766535 0.799212 0.792523 0.914157 0.855069 0.815718 0.872950 0.881087 0.935131 0.934686 0.926006 0.933684 0.924096 0.935276 0.950111 0.951974 0.978071 0.980377 1.000000 0.964104 0.949347 0.975648 0.971420 0.975150 0.971271 0.968905 0.970942 0.963396 0.970797 0.969301 0.960339 0.967658 0.965549 0.956022 0.945209 0.938030 0.937073 0.931715 0.928582 0.919117 0.920332 0.904692
Infant mortality rate - 1990 0.703166 0.732461 0.700662 0.843153 0.743084 0.690130 0.794694 0.792109 0.890624 0.889658 0.865928 0.875236 0.871601 0.885496 0.917971 0.906238 0.936519 0.943977 0.964104 1.000000 0.980766 0.986918 0.984215 0.976732 0.977134 0.959793 0.958363 0.945918 0.959260 0.953773 0.931496 0.945774 0.941075 0.934715 0.917474 0.916262 0.915736 0.910882 0.907453 0.894565 0.893806 0.872675
Infant mortality rate - 1991 0.673207 0.691681 0.680676 0.832221 0.731176 0.681232 0.800614 0.750683 0.864598 0.857750 0.817489 0.829204 0.820401 0.841572 0.867024 0.860084 0.904702 0.913937 0.949347 0.980766 1.000000 0.973016 0.973849 0.963323 0.954712 0.929439 0.926420 0.946082 0.934236 0.928335 0.901546 0.931536 0.905353 0.901418 0.880125 0.878823 0.878950 0.873996 0.875425 0.864536 0.873955 0.857758
Infant mortality rate - 1992 0.670230 0.705637 0.725438 0.876743 0.786303 0.741028 0.858327 0.821471 0.883436 0.882898 0.859760 0.862658 0.873967 0.907061 0.909705 0.901509 0.940929 0.956123 0.975648 0.986918 0.973016 1.000000 0.985082 0.989444 0.987343 0.970820 0.970370 0.975915 0.975980 0.972625 0.955618 0.971710 0.958481 0.947344 0.933197 0.929855 0.929184 0.925582 0.919677 0.909950 0.911676 0.892336
Infant mortality rate - 1993 0.694482 0.740356 0.715766 0.894108 0.782339 0.733834 0.828957 0.822287 0.874114 0.884653 0.860364 0.866245 0.864313 0.883151 0.922986 0.909630 0.943483 0.955719 0.971420 0.984215 0.973849 0.985082 1.000000 0.985756 0.985935 0.973154 0.972884 0.980063 0.973082 0.968346 0.949647 0.968520 0.955071 0.955280 0.940545 0.941050 0.940341 0.934321 0.930544 0.920315 0.922856 0.909496
Infant mortality rate - 1994 0.698468 0.716630 0.756589 0.889398 0.797520 0.739887 0.860156 0.828898 0.883571 0.884365 0.876971 0.876356 0.873996 0.899094 0.915784 0.910853 0.944935 0.959170 0.975150 0.976732 0.963323 0.989444 0.985756 1.000000 0.989522 0.974823 0.976327 0.975896 0.987028 0.985201 0.974231 0.969842 0.973371 0.952679 0.939221 0.935459 0.937567 0.932920 0.923720 0.913582 0.917087 0.904275
Infant mortality rate - 1995 0.688100 0.707883 0.740056 0.886411 0.800231 0.748013 0.867650 0.833210 0.871360 0.877350 0.873681 0.871436 0.864557 0.894165 0.923191 0.916209 0.949041 0.965941 0.971271 0.977134 0.954712 0.987343 0.985935 0.989522 1.000000 0.991330 0.990756 0.989067 0.986534 0.982957 0.968087 0.983084 0.976271 0.970860 0.958038 0.957517 0.957221 0.953587 0.948511 0.935921 0.936545 0.926044
Infant mortality rate - 1996 0.711382 0.742343 0.753637 0.904697 0.845783 0.804359 0.891360 0.871898 0.882148 0.898738 0.891957 0.887418 0.881777 0.912297 0.938408 0.935823 0.957757 0.974802 0.968905 0.959793 0.929439 0.970820 0.973154 0.974823 0.991330 1.000000 0.998244 0.992777 0.981086 0.975888 0.960076 0.983973 0.971672 0.978875 0.964940 0.967313 0.964581 0.959497 0.958440 0.945837 0.948334 0.939117
Infant mortality rate - 1997 0.722708 0.742170 0.763347 0.910126 0.852505 0.810586 0.889848 0.873136 0.881054 0.898384 0.892605 0.890071 0.884728 0.918162 0.945204 0.945193 0.963164 0.976211 0.970942 0.958363 0.926420 0.970370 0.972884 0.976327 0.990756 0.998244 1.000000 0.981240 0.980634 0.970746 0.966260 0.954451 0.954188 0.949290 0.941617 0.930139 0.931721 0.898568 0.900558 0.884459 0.886631 0.875388
Infant mortality rate - 1998 0.697900 0.728297 0.754122 0.895432 0.838161 0.806248 0.872181 0.863996 0.882901 0.897992 0.872968 0.871690 0.871404 0.899985 0.909646 0.919020 0.949195 0.964286 0.963396 0.945918 0.946082 0.975915 0.980063 0.975896 0.989067 0.992777 0.981240 1.000000 0.970779 0.976499 0.963705 0.968183 0.959160 0.948150 0.937376 0.918657 0.912560 0.882769 0.885592 0.868108 0.868469 0.855466
Infant mortality rate - 1999 0.709734 0.728588 0.768024 0.900775 0.821795 0.767658 0.859598 0.860439 0.893384 0.897565 0.886019 0.885258 0.878973 0.907046 0.927503 0.928218 0.951569 0.967550 0.970797 0.959260 0.934236 0.975980 0.973082 0.987028 0.986534 0.981086 0.980634 0.970779 1.000000 0.991702 0.989753 0.969218 0.976628 0.967701 0.959685 0.942411 0.936226 0.908297 0.912085 0.898805 0.896377 0.880714
Infant mortality rate - 2000 0.712501 0.730600 0.779673 0.904929 0.825411 0.767964 0.865333 0.862679 0.896761 0.898716 0.890668 0.889378 0.881713 0.907158 0.927088 0.926061 0.950833 0.966657 0.969301 0.953773 0.928335 0.972625 0.968346 0.985201 0.982957 0.975888 0.970746 0.976499 0.991702 1.000000 0.991114 0.975032 0.982194 0.967339 0.951908 0.931829 0.923958 0.901087 0.902922 0.892191 0.890289 0.875553
Infant mortality rate - 2001 0.709655 0.732776 0.781250 0.895114 0.813840 0.757916 0.859111 0.865072 0.884366 0.881374 0.887230 0.882817 0.870834 0.889253 0.910024 0.907243 0.932359 0.951034 0.960339 0.931496 0.901546 0.955618 0.949647 0.974231 0.968087 0.960076 0.966260 0.963705 0.989753 0.991114 1.000000 0.976879 0.991701 0.974530 0.957385 0.935547 0.926616 0.910441 0.907944 0.900676 0.898857 0.883466
Infant mortality rate - 2002 0.726023 0.739112 0.758101 0.891124 0.838796 0.813477 0.869676 0.873139 0.873839 0.882487 0.861863 0.863111 0.872961 0.910112 0.917903 0.930735 0.947650 0.962267 0.967658 0.945774 0.931536 0.971710 0.968520 0.969842 0.983084 0.983973 0.954451 0.968183 0.969218 0.975032 0.976879 1.000000 0.988631 0.984252 0.964106 0.944376 0.930236 0.910878 0.915545 0.903567 0.903692 0.885509
Infant mortality rate - 2003 0.725225 0.740161 0.790324 0.892945 0.822564 0.769578 0.867472 0.872000 0.888147 0.888853 0.898778 0.893608 0.880729 0.897581 0.922292 0.920916 0.943490 0.960213 0.965549 0.941075 0.905353 0.958481 0.955071 0.973371 0.976271 0.971672 0.954188 0.959160 0.976628 0.982194 0.991701 0.988631 1.000000 0.983561 0.968825 0.946891 0.935352 0.921067 0.919601 0.912692 0.909744 0.892587
Infant mortality rate - 2004 0.681149 0.728561 0.703641 0.868770 0.801427 0.766545 0.816960 0.847042 0.858661 0.875310 0.863148 0.865173 0.844219 0.883908 0.926812 0.933803 0.946186 0.964411 0.956022 0.934715 0.901418 0.947344 0.955280 0.952679 0.970860 0.978875 0.949290 0.948150 0.967701 0.967339 0.974530 0.984252 0.983561 1.000000 0.984970 0.973308 0.960266 0.944585 0.940920 0.934684 0.934865 0.921736
Infant mortality rate - 2005 0.697323 0.730167 0.713404 0.867649 0.805409 0.762633 0.806140 0.839442 0.844709 0.862330 0.854254 0.857774 0.838367 0.886575 0.928047 0.935680 0.943086 0.958893 0.945209 0.917474 0.880125 0.933197 0.940545 0.939221 0.958038 0.964940 0.941617 0.937376 0.959685 0.951908 0.957385 0.964106 0.968825 0.984970 1.000000 0.991727 0.979286 0.954912 0.951312 0.944884 0.942473 0.929644
Infant mortality rate - 2006 0.712676 0.740032 0.695464 0.861163 0.798265 0.756466 0.789402 0.832620 0.833025 0.852908 0.843092 0.848249 0.830281 0.881142 0.931230 0.938567 0.938375 0.952729 0.938030 0.916262 0.878823 0.929855 0.941050 0.935459 0.957517 0.967313 0.930139 0.918657 0.942411 0.931829 0.935547 0.944376 0.946891 0.973308 0.991727 1.000000 0.993374 0.973183 0.966100 0.956690 0.958160 0.948386
Infant mortality rate - 2007 0.717685 0.738995 0.699770 0.855334 0.791353 0.747302 0.784373 0.827329 0.829629 0.847524 0.843017 0.848456 0.828555 0.877715 0.929464 0.937091 0.935602 0.949252 0.937073 0.915736 0.878950 0.929184 0.940341 0.937567 0.957221 0.964581 0.931721 0.912560 0.936226 0.923958 0.926616 0.930236 0.935352 0.960266 0.979286 0.993374 1.000000 0.978150 0.970843 0.957557 0.959499 0.949244
Infant mortality rate - 2008 0.703589 0.727175 0.698895 0.848303 0.784438 0.736278 0.781542 0.819264 0.823803 0.842323 0.840504 0.845345 0.823459 0.874693 0.922969 0.928527 0.929941 0.945121 0.931715 0.910882 0.873996 0.925582 0.934321 0.932920 0.953587 0.959497 0.898568 0.882769 0.908297 0.901087 0.910441 0.910878 0.921067 0.944585 0.954912 0.973183 0.978150 1.000000 0.990823 0.986193 0.982431 0.974718
Infant mortality rate - 2009 0.703417 0.725021 0.696566 0.847587 0.795617 0.751757 0.785576 0.818217 0.820653 0.843715 0.835000 0.841052 0.820541 0.876846 0.921303 0.930964 0.930999 0.945291 0.928582 0.907453 0.875425 0.919677 0.930544 0.923720 0.948511 0.958440 0.900558 0.885592 0.912085 0.902922 0.907944 0.915545 0.919601 0.940920 0.951312 0.966100 0.970843 0.990823 1.000000 0.995346 0.990567 0.980478
Infant mortality rate - 2010 0.693944 0.724636 0.687523 0.846268 0.789349 0.735622 0.769280 0.805993 0.811903 0.836464 0.830042 0.837445 0.815091 0.875291 0.918634 0.921622 0.922586 0.937467 0.919117 0.894565 0.864536 0.909950 0.920315 0.913582 0.935921 0.945837 0.884459 0.868108 0.898805 0.892191 0.900676 0.903567 0.912692 0.934684 0.944884 0.956690 0.957557 0.986193 0.995346 1.000000 0.995444 0.988216
Infant mortality rate - 2011 0.697818 0.722968 0.672007 0.838983 0.783962 0.728340 0.755040 0.793376 0.800510 0.826309 0.825379 0.833799 0.809820 0.874706 0.916384 0.919783 0.919437 0.933329 0.920332 0.893806 0.873955 0.911676 0.922856 0.917087 0.936545 0.948334 0.886631 0.868469 0.896377 0.890289 0.898857 0.903692 0.909744 0.934865 0.942473 0.958160 0.959499 0.982431 0.990567 0.995444 1.000000 0.995593
Infant mortality rate - 2012 0.690637 0.711218 0.661163 0.832381 0.773737 0.709942 0.739052 0.775312 0.778092 0.807073 0.816422 0.824043 0.790432 0.856027 0.906722 0.907303 0.907500 0.921859 0.904692 0.872675 0.857758 0.892336 0.909496 0.904275 0.926044 0.939117 0.875388 0.855466 0.880714 0.875553 0.883466 0.885509 0.892587 0.921736 0.929644 0.948386 0.949244 0.974718 0.980478 0.988216 0.995593 1.000000
In [23]:
for i in ["Infant mortality rate - 1975","Infant mortality rate - 1986","Infant mortality rate - 1997","Infant mortality rate - 2003"]:
    data[i].fillna(data[i].median(),inplace=True)
In [24]:
from sklearn.impute import KNNImputer
impute=KNNImputer()
In [25]:
for i in data.select_dtypes(include="number").columns:
    data[i]=impute.fit_transform(data[[i]])
In [26]:
data.isnull().sum()
Out[26]:
Category                        0
Country/ State/ UT Name         0
Infant mortality rate - 1971    0
Infant mortality rate - 1972    0
Infant mortality rate - 1973    0
Infant mortality rate - 1974    0
Infant mortality rate - 1975    0
Infant mortality rate - 1976    0
Infant mortality rate - 1977    0
Infant mortality rate - 1978    0
Infant mortality rate - 1979    0
Infant mortality rate - 1980    0
Infant mortality rate - 1981    0
Infant mortality rate - 1982    0
Infant mortality rate - 1983    0
Infant mortality rate - 1984    0
Infant mortality rate - 1985    0
Infant mortality rate - 1986    0
Infant mortality rate - 1987    0
Infant mortality rate - 1988    0
Infant mortality rate - 1989    0
Infant mortality rate - 1990    0
Infant mortality rate - 1991    0
Infant mortality rate - 1992    0
Infant mortality rate - 1993    0
Infant mortality rate - 1994    0
Infant mortality rate - 1995    0
Infant mortality rate - 1996    0
Infant mortality rate - 1997    0
Infant mortality rate - 1998    0
Infant mortality rate - 1999    0
Infant mortality rate - 2000    0
Infant mortality rate - 2001    0
Infant mortality rate - 2002    0
Infant mortality rate - 2003    0
Infant mortality rate - 2004    0
Infant mortality rate - 2005    0
Infant mortality rate - 2006    0
Infant mortality rate - 2007    0
Infant mortality rate - 2008    0
Infant mortality rate - 2009    0
Infant mortality rate - 2010    0
Infant mortality rate - 2011    0
Infant mortality rate - 2012    0
dtype: int64
In [27]:
def wisker(col):
    q1,q3=np.percentile(col,[25,75])
    iqr=q3-q1
    lw=q1-1.5*iqr
    uw=q3+1.5*iqr
    return lw,uw
In [28]:
wisker(data['Infant mortality rate - 1992'])
Out[28]:
(72.64705882352942, 73.58823529411765)
In [29]:
for i in ['Infant mortality rate - 1972','Infant mortality rate - 1976','Infant mortality rate - 1977','Infant mortality rate - 1978','Infant mortality rate - 1980','Infant mortality rate - 1983','Infant mortality rate - 1984','Infant mortality rate - 1988','Infant mortality rate - 1989','Infant mortality rate - 1990']:
    lw,uw=wisker(data[i])
    data[i]=np.where(data[i]<lw,lw,data[i])
    data[i]=np.where(data[i]>uw,uw,data[i])
In [30]:
for i in ['Infant mortality rate - 1972','Infant mortality rate - 1976','Infant mortality rate - 1977','Infant mortality rate - 1978','Infant mortality rate - 1980','Infant mortality rate - 1983','Infant mortality rate - 1984','Infant mortality rate - 1988','Infant mortality rate - 1989','Infant mortality rate - 1990']:
    sns.boxplot(data[i])
    plt.show()
In [31]:
class Node():
    def __init__(self, feature_index=None, threshold=None, left=None, right=None, var_red=None, value=None):
        ''' constructor ''' 
        
        # for decision node
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right
        self.var_red = var_red
        
        # for leaf node
        self.value = value
In [32]:
class DecisionTreeRegressor():
    def __init__(self, min_samples_split=2, max_depth=2):
        ''' constructor '''
        
        # initialize the root of the tree 
        self.root = None
        
        # stopping conditions
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        
    def build_tree(self, dataset, curr_depth=0):
        ''' recursive function to build the tree '''
        
        X, Y = dataset[:,:-1], dataset[:,-1]
        num_samples, num_features = np.shape(X)
        best_split = {}
        # split until stopping conditions are met
        if num_samples>=self.min_samples_split and curr_depth<=self.max_depth:
            # find the best split
            best_split = self.get_best_split(dataset, num_samples, num_features)
            # check if information gain is positive
            if best_split["var_red"]>0:
                # recur left
                left_subtree = self.build_tree(best_split["dataset_left"], curr_depth+1)
                # recur right
                right_subtree = self.build_tree(best_split["dataset_right"], curr_depth+1)
                # return decision node
                return Node(best_split["feature_index"], best_split["threshold"], 
                            left_subtree, right_subtree, best_split["var_red"])
        
        # compute leaf node
        leaf_value = self.calculate_leaf_value(Y)
        # return leaf node
        return Node(value=leaf_value)
    
    def get_best_split(self, dataset, num_samples, num_features):
        ''' function to find the best split '''
        
        # dictionary to store the best split
        best_split = {}
        max_var_red = -float("inf")
        # loop over all the features
        for feature_index in range(num_features):
            feature_values = dataset[:, feature_index]
            possible_thresholds = np.unique(feature_values)
            # loop over all the feature values present in the data
            for threshold in possible_thresholds:
                # get current split
                dataset_left, dataset_right = self.split(dataset, feature_index, threshold)
                # check if childs are not null
                if len(dataset_left)>0 and len(dataset_right)>0:
                    y, left_y, right_y = dataset[:, -1], dataset_left[:, -1], dataset_right[:, -1]
                    # compute information gain
                    curr_var_red = self.variance_reduction(y, left_y, right_y)
                    # update the best split if needed
                    if curr_var_red>max_var_red:
                        best_split["feature_index"] = feature_index
                        best_split["threshold"] = threshold
                        best_split["dataset_left"] = dataset_left
                        best_split["dataset_right"] = dataset_right
                        best_split["var_red"] = curr_var_red
                        max_var_red = curr_var_red
                        
        # return best split
        return best_split
    
    def split(self, dataset, feature_index, threshold):
        ''' function to split the data '''
        
        dataset_left = np.array([row for row in dataset if row[feature_index]<=threshold])
        dataset_right = np.array([row for row in dataset if row[feature_index]>threshold])
        return dataset_left, dataset_right
    
    def variance_reduction(self, parent, l_child, r_child):
        ''' function to compute variance reduction '''
        
        weight_l = len(l_child) / len(parent)
        weight_r = len(r_child) / len(parent)
        reduction = np.var(parent) - (weight_l * np.var(l_child) + weight_r * np.var(r_child))
        return reduction
    
    def calculate_leaf_value(self, Y):
        ''' function to compute leaf node '''
        
        val = np.mean(Y)
        return val
                
    def print_tree(self, tree=None, indent=" "):
        ''' function to print the tree '''
        
        if not tree:
            tree = self.root

        if tree.value is not None:
            print(tree.value)

        else:
            print("X_"+str(tree.feature_index), "<=", tree.threshold, "?", tree.var_red)
            print("%sleft:" % (indent), end="")
            self.print_tree(tree.left, indent + indent)
            print("%sright:" % (indent), end="")
            self.print_tree(tree.right, indent + indent)
    
    def fit(self, X, Y):
        ''' function to train the tree '''
        
        dataset = np.concatenate((X, Y), axis=1)
        self.root = self.build_tree(dataset)
        
    def make_prediction(self, x, tree):
        ''' function to predict new dataset '''
        
        if tree.value!=None: return tree.value
        feature_val = x[tree.feature_index]
        if feature_val<=tree.threshold:
            return self.make_prediction(x, tree.left)
        else:
            return self.make_prediction(x, tree.right)
    
    def predict(self, X):
        ''' function to predict a single data point '''
        
        preditions = [self.make_prediction(x, self.root) for x in X]
        return preditions
In [33]:
X = data.iloc[:, :-1].values
Y = data.iloc[:, -1].values.reshape(-1,1)
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2, random_state=41)
In [34]:
regressor = DecisionTreeRegressor(min_samples_split=3, max_depth=3)
regressor.fit(X_train,Y_train)
regressor.print_tree()
X_42 <= 30.0 ? 139.7362899005756
 left:X_28 <= 30.0 ? 27.441617357001974
  left:X_2 <= 58.0 ? 0.888888888888889
    left:12.0
    right:10.0
  right:X_41 <= 24.0 ? 5.415
    left:X_29 <= 53.0 ? 1.6875
        left:21.0
        right:18.0
    right:X_2 <= 102.0 ? 1.8
        left:28.0
        right:24.4
 right:X_39 <= 56.0 ? 45.19185185185185
  left:X_35 <= 48.0 ? 11.41358024691358
    left:X_1 <= Dadra and Nagar Haveli ? 0.888888888888889
        left:33.0
        right:35.0
    right:X_28 <= 62.0 ? 2.7222222222222228
        left:38.5
        right:42.0
  right:X_2 <= 114.76923076923077 ? 8.680555555555557
    left:48.0
    right:X_1 <= Madhya Pradesh ? 1.5625
        left:55.5
        right:53.0
In [35]:
Y_pred = regressor.predict(X_test) 
from sklearn.metrics import mean_squared_error
np.sqrt(mean_squared_error(Y_test, Y_pred))
Out[35]:
4.571788490295674
In [36]:
data.columns
Out[36]:
Index(['Category', 'Country/ State/ UT Name', 'Infant mortality rate - 1971',
       'Infant mortality rate - 1972', 'Infant mortality rate - 1973',
       'Infant mortality rate - 1974', 'Infant mortality rate - 1975',
       'Infant mortality rate - 1976', 'Infant mortality rate - 1977',
       'Infant mortality rate - 1978', 'Infant mortality rate - 1979',
       'Infant mortality rate - 1980', 'Infant mortality rate - 1981',
       'Infant mortality rate - 1982', 'Infant mortality rate - 1983',
       'Infant mortality rate - 1984', 'Infant mortality rate - 1985',
       'Infant mortality rate - 1986', 'Infant mortality rate - 1987',
       'Infant mortality rate - 1988', 'Infant mortality rate - 1989',
       'Infant mortality rate - 1990', 'Infant mortality rate - 1991',
       'Infant mortality rate - 1992', 'Infant mortality rate - 1993',
       'Infant mortality rate - 1994', 'Infant mortality rate - 1995',
       'Infant mortality rate - 1996', 'Infant mortality rate - 1997',
       'Infant mortality rate - 1998', 'Infant mortality rate - 1999',
       'Infant mortality rate - 2000', 'Infant mortality rate - 2001',
       'Infant mortality rate - 2002', 'Infant mortality rate - 2003',
       'Infant mortality rate - 2004', 'Infant mortality rate - 2005',
       'Infant mortality rate - 2006', 'Infant mortality rate - 2007',
       'Infant mortality rate - 2008', 'Infant mortality rate - 2009',
       'Infant mortality rate - 2010', 'Infant mortality rate - 2011',
       'Infant mortality rate - 2012'],
      dtype='object')
In [37]:
data.drop_duplicates()
Out[37]:
Category Country/ State/ UT Name Infant mortality rate - 1971 Infant mortality rate - 1972 Infant mortality rate - 1973 Infant mortality rate - 1974 Infant mortality rate - 1975 Infant mortality rate - 1976 Infant mortality rate - 1977 Infant mortality rate - 1978 ... Infant mortality rate - 2003 Infant mortality rate - 2004 Infant mortality rate - 2005 Infant mortality rate - 2006 Infant mortality rate - 2007 Infant mortality rate - 2008 Infant mortality rate - 2009 Infant mortality rate - 2010 Infant mortality rate - 2011 Infant mortality rate - 2012
0 Country India (Average) 129.000000 123.142857 134.0 126.0000 140.0 119.820312 114.6875 115.007812 ... 60.0 58.0 58.0 57.0 55.0 53.0 50.0 47.0 44.0 42.0
1 State Andhra Pradesh 106.000000 123.142857 105.0 111.0000 123.0 119.820312 114.6875 115.007812 ... 59.0 59.0 57.0 56.0 54.0 52.0 49.0 46.0 43.0 41.0
2 State Assam 139.000000 123.142857 136.0 136.0000 144.0 119.820312 114.6875 115.007812 ... 67.0 66.0 68.0 67.0 66.0 64.0 61.0 58.0 55.0 55.0
3 State Bihar 114.769231 123.142857 117.5 111.3125 120.5 116.187500 114.6875 111.687500 ... 60.0 61.0 61.0 60.0 58.0 56.0 52.0 48.0 44.0 43.0
4 State Chhattisgarh 114.769231 123.142857 117.5 111.3125 120.5 116.187500 114.6875 111.687500 ... 70.0 60.0 63.0 61.0 59.0 57.0 54.0 51.0 48.0 47.0
5 State Gujarat 144.000000 123.142857 161.0 106.0000 154.0 119.820312 114.6875 115.007812 ... 57.0 53.0 54.0 53.0 52.0 50.0 48.0 44.0 41.0 38.0
6 State Haryana 72.000000 123.142857 104.0 102.0000 114.0 114.007812 114.6875 109.695312 ... 59.0 61.0 60.0 57.0 55.0 54.0 51.0 48.0 44.0 42.0
7 State Jharkhand 114.769231 123.142857 117.5 111.3125 120.5 116.187500 114.6875 111.687500 ... 51.0 49.0 50.0 49.0 48.0 46.0 44.0 42.0 39.0 38.0
8 State Karnataka 95.000000 123.142857 90.0 86.0000 80.0 114.007812 114.6875 109.695312 ... 52.0 49.0 50.0 48.0 47.0 45.0 41.0 38.0 35.0 32.0
9 State Kerala 58.000000 123.142857 58.0 54.0000 54.0 114.007812 114.6875 109.695312 ... 11.0 12.0 14.0 15.0 13.0 12.0 12.0 13.0 12.0 12.0
10 State Madhya Pradesh 135.000000 123.142857 145.0 137.0000 151.0 119.820312 114.6875 115.007812 ... 82.0 79.0 76.0 74.0 72.0 70.0 67.0 62.0 59.0 56.0
11 State Maharashtra 105.000000 123.142857 116.0 89.0000 92.0 114.007812 114.6875 109.695312 ... 42.0 36.0 36.0 35.0 34.0 33.0 31.0 28.0 25.0 25.0
12 State Odisha 127.000000 123.142857 145.0 150.0000 149.0 119.820312 114.6875 115.007812 ... 83.0 77.0 75.0 73.0 71.0 69.0 65.0 61.0 57.0 53.0
13 State Punjab 102.000000 123.142857 115.0 97.0000 98.0 114.007812 114.6875 115.007812 ... 49.0 45.0 44.0 44.0 43.0 41.0 38.0 34.0 30.0 28.0
14 State Rajasthan 114.769231 123.142857 137.0 133.0000 155.0 119.820312 114.6875 115.007812 ... 75.0 67.0 68.0 67.0 65.0 63.0 59.0 55.0 52.0 49.0
15 State Tamil Nadu 113.000000 123.142857 108.0 106.0000 112.0 114.007812 114.6875 109.695312 ... 43.0 41.0 37.0 37.0 35.0 31.0 28.0 24.0 22.0 21.0
16 State Uttar Pradesh 167.000000 123.142857 176.0 172.0000 198.0 119.820312 114.6875 115.007812 ... 76.0 72.0 73.0 71.0 69.0 67.0 63.0 61.0 57.0 53.0
17 State West Bengal 114.769231 123.142857 117.5 111.3125 120.5 116.187500 114.6875 111.687500 ... 46.0 40.0 38.0 38.0 37.0 35.0 33.0 31.0 32.0 32.0
18 State Arunachal Pradesh 114.769231 123.142857 117.5 111.3125 120.5 116.187500 114.6875 111.687500 ... 34.0 38.0 37.0 40.0 37.0 32.0 32.0 31.0 32.0 33.0
19 State Goa 114.769231 123.142857 117.5 111.3125 120.5 116.187500 114.6875 111.687500 ... 16.0 17.0 16.0 15.0 13.0 10.0 11.0 10.0 11.0 10.0
20 State Himachal Pradesh 114.769231 123.142857 85.0 100.0000 118.0 119.820312 114.6875 109.695312 ... 42.0 51.0 49.0 50.0 47.0 44.0 45.0 40.0 38.0 36.0
21 State Jammu and Kashmir 114.769231 123.142857 65.0 76.0000 68.0 114.007812 114.6875 109.695312 ... 45.0 49.0 50.0 52.0 51.0 49.0 45.0 43.0 41.0 39.0
22 State Manipur 114.769231 123.142857 117.5 111.3125 120.5 116.187500 114.6875 111.687500 ... 16.0 14.0 13.0 11.0 12.0 14.0 16.0 14.0 11.0 10.0
23 State Meghalaya 114.769231 123.142857 117.5 111.3125 120.5 116.187500 114.6875 111.687500 ... 57.0 54.0 49.0 53.0 56.0 58.0 59.0 55.0 52.0 49.0
24 State Mizoram 114.769231 123.142857 117.5 111.3125 120.5 116.187500 114.6875 111.687500 ... 16.0 19.0 20.0 25.0 23.0 37.0 36.0 37.0 34.0 35.0
25 State Nagaland 114.769231 123.142857 117.5 111.3125 120.5 116.187500 114.6875 111.687500 ... 45.0 17.0 18.0 20.0 21.0 26.0 26.0 23.0 21.0 18.0
26 State Sikkim 114.769231 123.142857 117.5 111.3125 120.5 116.187500 114.6875 111.687500 ... 33.0 32.0 30.0 33.0 34.0 33.0 34.0 30.0 26.0 24.0
27 State Tripura 114.769231 123.142857 117.5 111.3125 120.5 116.187500 114.6875 111.687500 ... 32.0 32.0 31.0 36.0 39.0 34.0 31.0 27.0 29.0 28.0
28 State Uttarakhand 114.769231 123.142857 117.5 111.3125 120.5 116.187500 114.6875 111.687500 ... 41.0 42.0 42.0 43.0 48.0 44.0 41.0 38.0 36.0 34.0
29 Union Territory Andaman and Nicobar Islands 114.769231 123.142857 117.5 111.3125 120.5 116.187500 114.6875 111.687500 ... 18.0 19.0 27.0 31.0 34.0 31.0 27.0 25.0 23.0 24.0
30 Union Territory Chandigarh 114.769231 123.142857 117.5 111.3125 120.5 116.187500 114.6875 111.687500 ... 19.0 21.0 19.0 23.0 27.0 28.0 25.0 22.0 20.0 20.0
31 Union Territory Dadra and Nagar Haveli 114.769231 123.142857 117.5 111.3125 120.5 116.187500 114.6875 111.687500 ... 54.0 48.0 42.0 35.0 34.0 34.0 37.0 38.0 35.0 33.0
32 Union Territory Daman and Diu 114.769231 123.142857 117.5 111.3125 120.5 116.187500 114.6875 111.687500 ... 39.0 37.0 28.0 28.0 27.0 31.0 24.0 23.0 22.0 22.0
33 Union Territory Delhi 114.769231 123.142857 117.5 111.3125 120.5 116.187500 114.6875 111.687500 ... 28.0 32.0 35.0 37.0 36.0 35.0 33.0 30.0 28.0 25.0
34 Union Territory Lakshadweep 114.769231 123.142857 117.5 111.3125 120.5 116.187500 114.6875 111.687500 ... 26.0 30.0 22.0 25.0 24.0 31.0 25.0 25.0 24.0 24.0
35 Union Territory Puducherry 114.769231 123.142857 117.5 111.3125 120.5 116.187500 114.6875 111.687500 ... 24.0 24.0 28.0 28.0 25.0 25.0 22.0 22.0 19.0 17.0

36 rows × 44 columns

In [38]:
dummy=pd.get_dummies(data=data,columns=["Category","Country/ State/ UT Name"],drop_first=True)
In [39]:
dummy
Out[39]:
Infant mortality rate - 1971 Infant mortality rate - 1972 Infant mortality rate - 1973 Infant mortality rate - 1974 Infant mortality rate - 1975 Infant mortality rate - 1976 Infant mortality rate - 1977 Infant mortality rate - 1978 Infant mortality rate - 1979 Infant mortality rate - 1980 ... Country/ State/ UT Name_Odisha Country/ State/ UT Name_Puducherry Country/ State/ UT Name_Punjab Country/ State/ UT Name_Rajasthan Country/ State/ UT Name_Sikkim Country/ State/ UT Name_Tamil Nadu Country/ State/ UT Name_Tripura Country/ State/ UT Name_Uttar Pradesh Country/ State/ UT Name_Uttarakhand Country/ State/ UT Name_West Bengal
0 129.000000 123.142857 134.0 126.0000 140.0 119.820312 114.6875 115.007812 120.000 100.0625 ... 0 0 0 0 0 0 0 0 0 0
1 106.000000 123.142857 105.0 111.0000 123.0 119.820312 114.6875 115.007812 106.000 100.0625 ... 0 0 0 0 0 0 0 0 0 0
2 139.000000 123.142857 136.0 136.0000 144.0 119.820312 114.6875 115.007812 104.000 100.0625 ... 0 0 0 0 0 0 0 0 0 0
3 114.769231 123.142857 117.5 111.3125 120.5 116.187500 114.6875 111.687500 105.125 100.0625 ... 0 0 0 0 0 0 0 0 0 0
4 114.769231 123.142857 117.5 111.3125 120.5 116.187500 114.6875 111.687500 105.125 100.0625 ... 0 0 0 0 0 0 0 0 0 0
5 144.000000 123.142857 161.0 106.0000 154.0 119.820312 114.6875 115.007812 123.000 100.0625 ... 0 0 0 0 0 0 0 0 0 0
6 72.000000 123.142857 104.0 102.0000 114.0 114.007812 114.6875 109.695312 100.000 100.0625 ... 0 0 0 0 0 0 0 0 0 0
7 114.769231 123.142857 117.5 111.3125 120.5 116.187500 114.6875 111.687500 105.125 100.0625 ... 0 0 0 0 0 0 0 0 0 0
8 95.000000 123.142857 90.0 86.0000 80.0 114.007812 114.6875 109.695312 83.000 100.0625 ... 0 0 0 0 0 0 0 0 0 0
9 58.000000 123.142857 58.0 54.0000 54.0 114.007812 114.6875 109.695312 43.000 100.0625 ... 0 0 0 0 0 0 0 0 0 0
10 135.000000 123.142857 145.0 137.0000 151.0 119.820312 114.6875 115.007812 143.000 100.0625 ... 0 0 0 0 0 0 0 0 0 0
11 105.000000 123.142857 116.0 89.0000 92.0 114.007812 114.6875 109.695312 86.000 100.0625 ... 0 0 0 0 0 0 0 0 0 0
12 127.000000 123.142857 145.0 150.0000 149.0 119.820312 114.6875 115.007812 149.000 100.0625 ... 1 0 0 0 0 0 0 0 0 0
13 102.000000 123.142857 115.0 97.0000 98.0 114.007812 114.6875 115.007812 92.000 100.0625 ... 0 0 1 0 0 0 0 0 0 0
14 114.769231 123.142857 137.0 133.0000 155.0 119.820312 114.6875 115.007812 108.000 100.0625 ... 0 0 0 1 0 0 0 0 0 0
15 113.000000 123.142857 108.0 106.0000 112.0 114.007812 114.6875 109.695312 100.000 100.0625 ... 0 0 0 0 0 1 0 0 0 0
16 167.000000 123.142857 176.0 172.0000 198.0 119.820312 114.6875 115.007812 162.000 100.0625 ... 0 0 0 0 0 0 0 1 0 0
17 114.769231 123.142857 117.5 111.3125 120.5 116.187500 114.6875 111.687500 105.125 100.0625 ... 0 0 0 0 0 0 0 0 0 1
18 114.769231 123.142857 117.5 111.3125 120.5 116.187500 114.6875 111.687500 105.125 100.0625 ... 0 0 0 0 0 0 0 0 0 0
19 114.769231 123.142857 117.5 111.3125 120.5 116.187500 114.6875 111.687500 105.125 100.0625 ... 0 0 0 0 0 0 0 0 0 0
20 114.769231 123.142857 85.0 100.0000 118.0 119.820312 114.6875 109.695312 87.000 100.0625 ... 0 0 0 0 0 0 0 0 0 0
21 114.769231 123.142857 65.0 76.0000 68.0 114.007812 114.6875 109.695312 76.000 100.0625 ... 0 0 0 0 0 0 0 0 0 0
22 114.769231 123.142857 117.5 111.3125 120.5 116.187500 114.6875 111.687500 105.125 100.0625 ... 0 0 0 0 0 0 0 0 0 0
23 114.769231 123.142857 117.5 111.3125 120.5 116.187500 114.6875 111.687500 105.125 100.0625 ... 0 0 0 0 0 0 0 0 0 0
24 114.769231 123.142857 117.5 111.3125 120.5 116.187500 114.6875 111.687500 105.125 100.0625 ... 0 0 0 0 0 0 0 0 0 0
25 114.769231 123.142857 117.5 111.3125 120.5 116.187500 114.6875 111.687500 105.125 100.0625 ... 0 0 0 0 0 0 0 0 0 0
26 114.769231 123.142857 117.5 111.3125 120.5 116.187500 114.6875 111.687500 105.125 100.0625 ... 0 0 0 0 1 0 0 0 0 0
27 114.769231 123.142857 117.5 111.3125 120.5 116.187500 114.6875 111.687500 105.125 100.0625 ... 0 0 0 0 0 0 1 0 0 0
28 114.769231 123.142857 117.5 111.3125 120.5 116.187500 114.6875 111.687500 105.125 100.0625 ... 0 0 0 0 0 0 0 0 1 0
29 114.769231 123.142857 117.5 111.3125 120.5 116.187500 114.6875 111.687500 105.125 100.0625 ... 0 0 0 0 0 0 0 0 0 0
30 114.769231 123.142857 117.5 111.3125 120.5 116.187500 114.6875 111.687500 105.125 100.0625 ... 0 0 0 0 0 0 0 0 0 0
31 114.769231 123.142857 117.5 111.3125 120.5 116.187500 114.6875 111.687500 105.125 100.0625 ... 0 0 0 0 0 0 0 0 0 0
32 114.769231 123.142857 117.5 111.3125 120.5 116.187500 114.6875 111.687500 105.125 100.0625 ... 0 0 0 0 0 0 0 0 0 0
33 114.769231 123.142857 117.5 111.3125 120.5 116.187500 114.6875 111.687500 105.125 100.0625 ... 0 0 0 0 0 0 0 0 0 0
34 114.769231 123.142857 117.5 111.3125 120.5 116.187500 114.6875 111.687500 105.125 100.0625 ... 0 0 0 0 0 0 0 0 0 0
35 114.769231 123.142857 117.5 111.3125 120.5 116.187500 114.6875 111.687500 105.125 100.0625 ... 0 1 0 0 0 0 0 0 0 0

36 rows × 79 columns

In [40]:
dummy.dtypes
Out[40]:
Infant mortality rate - 1971             float64
Infant mortality rate - 1972             float64
Infant mortality rate - 1973             float64
Infant mortality rate - 1974             float64
Infant mortality rate - 1975             float64
                                          ...   
Country/ State/ UT Name_Tamil Nadu         uint8
Country/ State/ UT Name_Tripura            uint8
Country/ State/ UT Name_Uttar Pradesh      uint8
Country/ State/ UT Name_Uttarakhand        uint8
Country/ State/ UT Name_West Bengal        uint8
Length: 79, dtype: object
In [41]:
plt.figure(figsize=(15,5))

plt.subplot(1,2,1)
sns.kdeplot(x=dummy['Infant mortality rate - 1973'])

plt.subplot(1,2,2)
sns.kdeplot(x=np.log(dummy['Infant mortality rate - 1973']))
Out[41]:
<AxesSubplot:xlabel='Infant mortality rate - 1973', ylabel='Density'>
In [42]:
dummy['Infant mortality rate - 1973']=np.log(dummy['Infant mortality rate - 1973'])
In [43]:
dummy.columns
Out[43]:
Index(['Infant mortality rate - 1971', 'Infant mortality rate - 1972',
       'Infant mortality rate - 1973', 'Infant mortality rate - 1974',
       'Infant mortality rate - 1975', 'Infant mortality rate - 1976',
       'Infant mortality rate - 1977', 'Infant mortality rate - 1978',
       'Infant mortality rate - 1979', 'Infant mortality rate - 1980',
       'Infant mortality rate - 1981', 'Infant mortality rate - 1982',
       'Infant mortality rate - 1983', 'Infant mortality rate - 1984',
       'Infant mortality rate - 1985', 'Infant mortality rate - 1986',
       'Infant mortality rate - 1987', 'Infant mortality rate - 1988',
       'Infant mortality rate - 1989', 'Infant mortality rate - 1990',
       'Infant mortality rate - 1991', 'Infant mortality rate - 1992',
       'Infant mortality rate - 1993', 'Infant mortality rate - 1994',
       'Infant mortality rate - 1995', 'Infant mortality rate - 1996',
       'Infant mortality rate - 1997', 'Infant mortality rate - 1998',
       'Infant mortality rate - 1999', 'Infant mortality rate - 2000',
       'Infant mortality rate - 2001', 'Infant mortality rate - 2002',
       'Infant mortality rate - 2003', 'Infant mortality rate - 2004',
       'Infant mortality rate - 2005', 'Infant mortality rate - 2006',
       'Infant mortality rate - 2007', 'Infant mortality rate - 2008',
       'Infant mortality rate - 2009', 'Infant mortality rate - 2010',
       'Infant mortality rate - 2011', 'Infant mortality rate - 2012',
       'Category_State', 'Category_Union Territory',
       'Country/ State/ UT Name_Andhra Pradesh',
       'Country/ State/ UT Name_Arunachal Pradesh',
       'Country/ State/ UT Name_Assam', 'Country/ State/ UT Name_Bihar',
       'Country/ State/ UT Name_Chandigarh',
       'Country/ State/ UT Name_Chhattisgarh',
       'Country/ State/ UT Name_Dadra and Nagar Haveli',
       'Country/ State/ UT Name_Daman and Diu',
       'Country/ State/ UT Name_Delhi', 'Country/ State/ UT Name_Goa',
       'Country/ State/ UT Name_Gujarat', 'Country/ State/ UT Name_Haryana',
       'Country/ State/ UT Name_Himachal Pradesh',
       'Country/ State/ UT Name_India (Average)',
       'Country/ State/ UT Name_Jammu and Kashmir',
       'Country/ State/ UT Name_Jharkhand',
       'Country/ State/ UT Name_Karnataka', 'Country/ State/ UT Name_Kerala',
       'Country/ State/ UT Name_Lakshadweep',
       'Country/ State/ UT Name_Madhya Pradesh',
       'Country/ State/ UT Name_Maharashtra',
       'Country/ State/ UT Name_Manipur', 'Country/ State/ UT Name_Meghalaya',
       'Country/ State/ UT Name_Mizoram', 'Country/ State/ UT Name_Nagaland',
       'Country/ State/ UT Name_Odisha', 'Country/ State/ UT Name_Puducherry',
       'Country/ State/ UT Name_Punjab', 'Country/ State/ UT Name_Rajasthan',
       'Country/ State/ UT Name_Sikkim', 'Country/ State/ UT Name_Tamil Nadu',
       'Country/ State/ UT Name_Tripura',
       'Country/ State/ UT Name_Uttar Pradesh',
       'Country/ State/ UT Name_Uttarakhand',
       'Country/ State/ UT Name_West Bengal'],
      dtype='object')
In [44]:
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
In [45]:
x=dummy.drop('Infant mortality rate - 1973',axis=1)
y=dummy['Infant mortality rate - 1973']
In [46]:
x_c=sm.add_constant(x)
In [47]:
x_train, x_test, y_train, y_test = train_test_split(x_c, y, test_size=30, random_state=42)
In [48]:
model=sm.OLS(y_train,x_train.astype(float)).fit()
In [49]:
import warnings
warnings.filterwarnings("ignore")
model.summary()
Out[49]:
OLS Regression Results
Dep. Variable: Infant mortality rate - 1973 R-squared: 1.000
Model: OLS Adj. R-squared: nan
Method: Least Squares F-statistic: nan
Date: Tue, 16 Jan 2024 Prob (F-statistic): nan
Time: 01:50:34 Log-Likelihood: 183.49
No. Observations: 6 AIC: -355.0
Df Residuals: 0 BIC: -356.2
Df Model: 5
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
Infant mortality rate - 1971 0.0012 inf 0 nan nan nan
Infant mortality rate - 1972 0.0042 inf 0 nan nan nan
Infant mortality rate - 1974 0.0008 inf 0 nan nan nan
Infant mortality rate - 1975 -0.0004 inf -0 nan nan nan
Infant mortality rate - 1976 0.0033 inf 0 nan nan nan
Infant mortality rate - 1977 0.0039 inf 0 nan nan nan
Infant mortality rate - 1978 0.0033 inf 0 nan nan nan
Infant mortality rate - 1979 0.0033 inf 0 nan nan nan
Infant mortality rate - 1980 0.0034 inf 0 nan nan nan
Infant mortality rate - 1981 0.0029 inf 0 nan nan nan
Infant mortality rate - 1982 0.0031 inf 0 nan nan nan
Infant mortality rate - 1983 0.0027 inf 0 nan nan nan
Infant mortality rate - 1984 0.0031 inf 0 nan nan nan
Infant mortality rate - 1985 0.0009 inf 0 nan nan nan
Infant mortality rate - 1986 0.0004 inf 0 nan nan nan
Infant mortality rate - 1987 0.0013 inf 0 nan nan nan
Infant mortality rate - 1988 0.0028 inf 0 nan nan nan
Infant mortality rate - 1989 0.0027 inf 0 nan nan nan
Infant mortality rate - 1990 0.0017 inf 0 nan nan nan
Infant mortality rate - 1991 0.0016 inf 0 nan nan nan
Infant mortality rate - 1992 0.0007 inf 0 nan nan nan
Infant mortality rate - 1993 0.0007 inf 0 nan nan nan
Infant mortality rate - 1994 0.0007 inf 0 nan nan nan
Infant mortality rate - 1995 0.0003 inf 0 nan nan nan
Infant mortality rate - 1996 8.83e-05 inf 0 nan nan nan
Infant mortality rate - 1997 -0.0002 inf -0 nan nan nan
Infant mortality rate - 1998 -0.0003 inf -0 nan nan nan
Infant mortality rate - 1999 0.0007 inf 0 nan nan nan
Infant mortality rate - 2000 0.0008 inf 0 nan nan nan
Infant mortality rate - 2001 6.928e-05 inf 0 nan nan nan
Infant mortality rate - 2002 -0.0013 inf -0 nan nan nan
Infant mortality rate - 2003 -0.0008 inf -0 nan nan nan
Infant mortality rate - 2004 -8.103e-05 inf -0 nan nan nan
Infant mortality rate - 2005 -0.0001 inf -0 nan nan nan
Infant mortality rate - 2006 -0.0003 inf -0 nan nan nan
Infant mortality rate - 2007 2.274e-06 inf 0 nan nan nan
Infant mortality rate - 2008 -3.716e-05 inf -0 nan nan nan
Infant mortality rate - 2009 -1.639e-05 inf -0 nan nan nan
Infant mortality rate - 2010 0.0001 inf 0 nan nan nan
Infant mortality rate - 2011 -3.492e-05 inf -0 nan nan nan
Infant mortality rate - 2012 8.763e-05 inf 0 nan nan nan
Category_State 3.421e-05 inf 0 nan nan nan
Category_Union Territory 0 nan nan nan nan nan
Country/ State/ UT Name_Andhra Pradesh 0 nan nan nan nan nan
Country/ State/ UT Name_Arunachal Pradesh 0 nan nan nan nan nan
Country/ State/ UT Name_Assam 0 nan nan nan nan nan
Country/ State/ UT Name_Bihar 0 nan nan nan nan nan
Country/ State/ UT Name_Chandigarh 0 nan nan nan nan nan
Country/ State/ UT Name_Chhattisgarh 0 nan nan nan nan nan
Country/ State/ UT Name_Dadra and Nagar Haveli 0 nan nan nan nan nan
Country/ State/ UT Name_Daman and Diu 0 nan nan nan nan nan
Country/ State/ UT Name_Delhi 0 nan nan nan nan nan
Country/ State/ UT Name_Goa 0 nan nan nan nan nan
Country/ State/ UT Name_Gujarat 0 nan nan nan nan nan
Country/ State/ UT Name_Haryana 6.266e-05 inf 0 nan nan nan
Country/ State/ UT Name_Himachal Pradesh -1.921e-05 inf -0 nan nan nan
Country/ State/ UT Name_India (Average) 0 nan nan nan nan nan
Country/ State/ UT Name_Jammu and Kashmir 0 nan nan nan nan nan
Country/ State/ UT Name_Jharkhand 7.358e-05 inf 0 nan nan nan
Country/ State/ UT Name_Karnataka 0 nan nan nan nan nan
Country/ State/ UT Name_Kerala 0 nan nan nan nan nan
Country/ State/ UT Name_Lakshadweep 0 nan nan nan nan nan
Country/ State/ UT Name_Madhya Pradesh 0 nan nan nan nan nan
Country/ State/ UT Name_Maharashtra 0 nan nan nan nan nan
Country/ State/ UT Name_Manipur 0 nan nan nan nan nan
Country/ State/ UT Name_Meghalaya 0 nan nan nan nan nan
Country/ State/ UT Name_Mizoram 0 nan nan nan nan nan
Country/ State/ UT Name_Nagaland 2.317e-06 inf 0 nan nan nan
Country/ State/ UT Name_Odisha 0 nan nan nan nan nan
Country/ State/ UT Name_Puducherry 0 nan nan nan nan nan
Country/ State/ UT Name_Punjab 0 nan nan nan nan nan
Country/ State/ UT Name_Rajasthan -0.0001 inf -0 nan nan nan
Country/ State/ UT Name_Sikkim 0 nan nan nan nan nan
Country/ State/ UT Name_Tamil Nadu 0 nan nan nan nan nan
Country/ State/ UT Name_Tripura 0 nan nan nan nan nan
Country/ State/ UT Name_Uttar Pradesh 0 nan nan nan nan nan
Country/ State/ UT Name_Uttarakhand 3.456e-05 inf 0 nan nan nan
Country/ State/ UT Name_West Bengal 0 nan nan nan nan nan
Omnibus: nan Durbin-Watson: 0.018
Prob(Omnibus): nan Jarque-Bera (JB): 0.329
Skew: 0.495 Prob(JB): 0.848
Kurtosis: 2.422 Cond. No. 67.0


Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The input rank is higher than the number of observations.
In [50]:
y_predict=model.predict(x_test.astype(float))
In [51]:
y_predict
Out[51]:
35    4.783727
13    4.400790
26    4.784565
30    4.794046
16    5.569802
31    4.743666
21    4.411965
12    5.453502
8     4.370448
17    4.589171
9     3.689537
34    4.786573
0     4.985427
4     4.716318
29    4.795674
15    4.541666
19    4.800954
5     4.976922
11    4.366440
1     4.597989
24    4.801698
2     4.942234
33    4.773353
3     4.906418
32    4.761697
23    4.733529
27    4.774203
10    5.405815
22    4.807343
18    4.771933
dtype: float64
In [52]:
residual=y_test-y_predict
residual
Out[52]:
35   -0.017289
13    0.344142
26   -0.018127
30   -0.027607
16   -0.399318
31    0.022773
21   -0.237578
12   -0.476768
8     0.129362
17    0.177267
9     0.370906
34   -0.020134
0    -0.087587
4     0.050120
29   -0.029236
15    0.140465
19   -0.034516
5     0.104483
11    0.387150
1     0.055971
24   -0.035259
2    -0.029579
33   -0.006914
3    -0.139980
32    0.004741
23    0.032910
27   -0.007765
10   -0.429081
22   -0.040905
18   -0.005495
dtype: float64
In [53]:
np.sqrt((residual**2).mean())
Out[53]:
0.19615621233855746
In [54]:
y_test.mean()
Out[54]:
4.753818534966868
In [55]:
0.19615621233855746/4.753818534966868
Out[55]:
0.0412628733923527
In [56]:
residual
Out[56]:
35   -0.017289
13    0.344142
26   -0.018127
30   -0.027607
16   -0.399318
31    0.022773
21   -0.237578
12   -0.476768
8     0.129362
17    0.177267
9     0.370906
34   -0.020134
0    -0.087587
4     0.050120
29   -0.029236
15    0.140465
19   -0.034516
5     0.104483
11    0.387150
1     0.055971
24   -0.035259
2    -0.029579
33   -0.006914
3    -0.139980
32    0.004741
23    0.032910
27   -0.007765
10   -0.429081
22   -0.040905
18   -0.005495
dtype: float64
In [57]:
residual.mean()
Out[57]:
-0.007428336446615861
In [58]:
print(residual.skew())
sns.kdeplot(residual)
-0.37582290538116186
Out[58]:
<AxesSubplot:ylabel='Density'>
In [59]:
sns.scatterplot(x=y_predict,y=residual)
Out[59]:
<AxesSubplot:>
In [60]:
a=pd.Series(residual,name="residual")
b=pd.Series(y_predict,name="y_predict")
In [61]:
ab=pd.DataFrame(a)
In [62]:
rp=pd.merge(a,b,left_index=True,right_index=True)
In [63]:
sns.lmplot(data=rp,x="y_predict",y="residual")
Out[63]:
<seaborn.axisgrid.FacetGrid at 0x29390efaf70>
In [64]:
Y_pred = regressor.predict(X_test) 
from sklearn.metrics import mean_squared_error
np.sqrt(mean_squared_error(Y_test, Y_pred))
Out[64]:
4.571788490295674
In [65]:
data.shape
Out[65]:
(36, 44)
In [66]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36 entries, 0 to 35
Data columns (total 44 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Category                      36 non-null     object 
 1   Country/ State/ UT Name       36 non-null     object 
 2   Infant mortality rate - 1971  36 non-null     float64
 3   Infant mortality rate - 1972  36 non-null     float64
 4   Infant mortality rate - 1973  36 non-null     float64
 5   Infant mortality rate - 1974  36 non-null     float64
 6   Infant mortality rate - 1975  36 non-null     float64
 7   Infant mortality rate - 1976  36 non-null     float64
 8   Infant mortality rate - 1977  36 non-null     float64
 9   Infant mortality rate - 1978  36 non-null     float64
 10  Infant mortality rate - 1979  36 non-null     float64
 11  Infant mortality rate - 1980  36 non-null     float64
 12  Infant mortality rate - 1981  36 non-null     float64
 13  Infant mortality rate - 1982  36 non-null     float64
 14  Infant mortality rate - 1983  36 non-null     float64
 15  Infant mortality rate - 1984  36 non-null     float64
 16  Infant mortality rate - 1985  36 non-null     float64
 17  Infant mortality rate - 1986  36 non-null     float64
 18  Infant mortality rate - 1987  36 non-null     float64
 19  Infant mortality rate - 1988  36 non-null     float64
 20  Infant mortality rate - 1989  36 non-null     float64
 21  Infant mortality rate - 1990  36 non-null     float64
 22  Infant mortality rate - 1991  36 non-null     float64
 23  Infant mortality rate - 1992  36 non-null     float64
 24  Infant mortality rate - 1993  36 non-null     float64
 25  Infant mortality rate - 1994  36 non-null     float64
 26  Infant mortality rate - 1995  36 non-null     float64
 27  Infant mortality rate - 1996  36 non-null     float64
 28  Infant mortality rate - 1997  36 non-null     float64
 29  Infant mortality rate - 1998  36 non-null     float64
 30  Infant mortality rate - 1999  36 non-null     float64
 31  Infant mortality rate - 2000  36 non-null     float64
 32  Infant mortality rate - 2001  36 non-null     float64
 33  Infant mortality rate - 2002  36 non-null     float64
 34  Infant mortality rate - 2003  36 non-null     float64
 35  Infant mortality rate - 2004  36 non-null     float64
 36  Infant mortality rate - 2005  36 non-null     float64
 37  Infant mortality rate - 2006  36 non-null     float64
 38  Infant mortality rate - 2007  36 non-null     float64
 39  Infant mortality rate - 2008  36 non-null     float64
 40  Infant mortality rate - 2009  36 non-null     float64
 41  Infant mortality rate - 2010  36 non-null     float64
 42  Infant mortality rate - 2011  36 non-null     float64
 43  Infant mortality rate - 2012  36 non-null     float64
dtypes: float64(42), object(2)
memory usage: 12.5+ KB
In [67]:
data.nunique()
Out[67]:
Category                         3
Country/ State/ UT Name         36
Infant mortality rate - 1971    14
Infant mortality rate - 1972     1
Infant mortality rate - 1973    16
Infant mortality rate - 1974    16
Infant mortality rate - 1975    17
Infant mortality rate - 1976     3
Infant mortality rate - 1977     1
Infant mortality rate - 1978     3
Infant mortality rate - 1979    16
Infant mortality rate - 1980     1
Infant mortality rate - 1981    18
Infant mortality rate - 1982    18
Infant mortality rate - 1983     6
Infant mortality rate - 1984     4
Infant mortality rate - 1985    18
Infant mortality rate - 1986    18
Infant mortality rate - 1987    17
Infant mortality rate - 1988     4
Infant mortality rate - 1989     5
Infant mortality rate - 1990     8
Infant mortality rate - 1991    17
Infant mortality rate - 1992    17
Infant mortality rate - 1993    16
Infant mortality rate - 1994    16
Infant mortality rate - 1995    15
Infant mortality rate - 1996    16
Infant mortality rate - 1997    23
Infant mortality rate - 1998    30
Infant mortality rate - 1999    30
Infant mortality rate - 2000    26
Infant mortality rate - 2001    29
Infant mortality rate - 2002    29
Infant mortality rate - 2003    29
Infant mortality rate - 2004    29
Infant mortality rate - 2005    29
Infant mortality rate - 2006    28
Infant mortality rate - 2007    27
Infant mortality rate - 2008    29
Infant mortality rate - 2009    30
Infant mortality rate - 2010    26
Infant mortality rate - 2011    27
Infant mortality rate - 2012    25
dtype: int64
In [68]:
data.columns
Out[68]:
Index(['Category', 'Country/ State/ UT Name', 'Infant mortality rate - 1971',
       'Infant mortality rate - 1972', 'Infant mortality rate - 1973',
       'Infant mortality rate - 1974', 'Infant mortality rate - 1975',
       'Infant mortality rate - 1976', 'Infant mortality rate - 1977',
       'Infant mortality rate - 1978', 'Infant mortality rate - 1979',
       'Infant mortality rate - 1980', 'Infant mortality rate - 1981',
       'Infant mortality rate - 1982', 'Infant mortality rate - 1983',
       'Infant mortality rate - 1984', 'Infant mortality rate - 1985',
       'Infant mortality rate - 1986', 'Infant mortality rate - 1987',
       'Infant mortality rate - 1988', 'Infant mortality rate - 1989',
       'Infant mortality rate - 1990', 'Infant mortality rate - 1991',
       'Infant mortality rate - 1992', 'Infant mortality rate - 1993',
       'Infant mortality rate - 1994', 'Infant mortality rate - 1995',
       'Infant mortality rate - 1996', 'Infant mortality rate - 1997',
       'Infant mortality rate - 1998', 'Infant mortality rate - 1999',
       'Infant mortality rate - 2000', 'Infant mortality rate - 2001',
       'Infant mortality rate - 2002', 'Infant mortality rate - 2003',
       'Infant mortality rate - 2004', 'Infant mortality rate - 2005',
       'Infant mortality rate - 2006', 'Infant mortality rate - 2007',
       'Infant mortality rate - 2008', 'Infant mortality rate - 2009',
       'Infant mortality rate - 2010', 'Infant mortality rate - 2011',
       'Infant mortality rate - 2012'],
      dtype='object')
In [69]:
new_data = data[['Infant mortality rate - 2003', 'Infant mortality rate - 2012']]
new_data
Out[69]:
Infant mortality rate - 2003 Infant mortality rate - 2012
0 60.0 42.0
1 59.0 41.0
2 67.0 55.0
3 60.0 43.0
4 70.0 47.0
5 57.0 38.0
6 59.0 42.0
7 51.0 38.0
8 52.0 32.0
9 11.0 12.0
10 82.0 56.0
11 42.0 25.0
12 83.0 53.0
13 49.0 28.0
14 75.0 49.0
15 43.0 21.0
16 76.0 53.0
17 46.0 32.0
18 34.0 33.0
19 16.0 10.0
20 42.0 36.0
21 45.0 39.0
22 16.0 10.0
23 57.0 49.0
24 16.0 35.0
25 45.0 18.0
26 33.0 24.0
27 32.0 28.0
28 41.0 34.0
29 18.0 24.0
30 19.0 20.0
31 54.0 33.0
32 39.0 22.0
33 28.0 25.0
34 26.0 24.0
35 24.0 17.0
In [70]:
new_data.plot(x='Infant mortality rate - 2003', y='Infant mortality rate - 2012', kind='scatter')	
Out[70]:
<AxesSubplot:xlabel='Infant mortality rate - 2003', ylabel='Infant mortality rate - 2012'>
In [71]:
data.plot(x='Infant mortality rate - 1998', y='Infant mortality rate - 2012', kind='scatter')
Out[71]:
<AxesSubplot:xlabel='Infant mortality rate - 1998', ylabel='Infant mortality rate - 2012'>
In [72]:
data.plot(x='Infant mortality rate - 2002', y='Infant mortality rate - 2012', kind='scatter')
Out[72]:
<AxesSubplot:xlabel='Infant mortality rate - 2002', ylabel='Infant mortality rate - 2012'>
In [73]:
data.plot(x='Infant mortality rate - 2001', y='Infant mortality rate - 2012', kind='scatter')
Out[73]:
<AxesSubplot:xlabel='Infant mortality rate - 2001', ylabel='Infant mortality rate - 2012'>
In [74]:
data.plot(x='Infant mortality rate - 2000', y='Infant mortality rate - 2012', kind='scatter')
Out[74]:
<AxesSubplot:xlabel='Infant mortality rate - 2000', ylabel='Infant mortality rate - 2012'>
In [75]:
x = new_data['Infant mortality rate - 2012']
y = new_data['Infant mortality rate - 2012']
In [76]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
In [77]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=30, random_state=42)
In [78]:
print("X_train:",x_train.shape)
print("X_test:",x_test.shape)
print("Y_train:",y_train.shape)
print("Y_test:",y_test.shape)
X_train: (6,)
X_test: (30,)
Y_train: (6,)
Y_test: (30,)
In [79]:
model = LinearRegression()
In [80]:
model.fit(x_train.values.reshape(-1,1), y_train)
Out[80]:
LinearRegression()
In [81]:
model.coef_
Out[81]:
array([1.])
In [82]:
model.intercept_
Out[82]:
0.0
In [83]:
y_pred = model.predict(x_test.values.reshape(-1,1))
In [84]:
mse = mean_squared_error(y_test, y_pred)
print("MSE --> ", mse)
MSE -->  0.0
In [85]:
import math
rmse = math.sqrt(mse)
print("RMSE --> ", rmse)
RMSE -->  0.0
In [86]:
mae = mean_absolute_error(y_test, y_pred)
print("MAE --> ", mae)
MAE -->  0.0
In [87]:
r2 = r2_score(y_test, y_pred)
print("R2 --> ", r2)
R2 -->  1.0
In [88]:
print("MSE --> ", mse)
print("RMSE --> ", rmse)
print("MAE --> ", mae)
print("R2 --> ", r2)
MSE -->  0.0
RMSE -->  0.0
MAE -->  0.0
R2 -->  1.0
In [89]:
import matplotlib.pyplot as plt
import seaborn as sns
In [90]:
plt.scatter(y_test, y_pred)
plt.xlabel('Actual')
plt.ylabel('Predicted')
Out[90]:
Text(0, 0.5, 'Predicted')
In [91]:
sns.regplot(x=x, y=y, ci=None, color ='blue')
Out[91]:
<AxesSubplot:xlabel='Infant mortality rate - 2012', ylabel='Infant mortality rate - 2012'>
In [92]:
data.plot(x='Infant mortality rate - 1971', y='Infant mortality rate - 2000', kind='scatter')
Out[92]:
<AxesSubplot:xlabel='Infant mortality rate - 1971', ylabel='Infant mortality rate - 2000'>
In [93]:
x = data[['Infant mortality rate - 2003', 'Infant mortality rate - 1971']]
x
Out[93]:
Infant mortality rate - 2003 Infant mortality rate - 1971
0 60.0 129.000000
1 59.0 106.000000
2 67.0 139.000000
3 60.0 114.769231
4 70.0 114.769231
5 57.0 144.000000
6 59.0 72.000000
7 51.0 114.769231
8 52.0 95.000000
9 11.0 58.000000
10 82.0 135.000000
11 42.0 105.000000
12 83.0 127.000000
13 49.0 102.000000
14 75.0 114.769231
15 43.0 113.000000
16 76.0 167.000000
17 46.0 114.769231
18 34.0 114.769231
19 16.0 114.769231
20 42.0 114.769231
21 45.0 114.769231
22 16.0 114.769231
23 57.0 114.769231
24 16.0 114.769231
25 45.0 114.769231
26 33.0 114.769231
27 32.0 114.769231
28 41.0 114.769231
29 18.0 114.769231
30 19.0 114.769231
31 54.0 114.769231
32 39.0 114.769231
33 28.0 114.769231
34 26.0 114.769231
35 24.0 114.769231
In [94]:
y
Out[94]:
0     42.0
1     41.0
2     55.0
3     43.0
4     47.0
5     38.0
6     42.0
7     38.0
8     32.0
9     12.0
10    56.0
11    25.0
12    53.0
13    28.0
14    49.0
15    21.0
16    53.0
17    32.0
18    33.0
19    10.0
20    36.0
21    39.0
22    10.0
23    49.0
24    35.0
25    18.0
26    24.0
27    28.0
28    34.0
29    24.0
30    20.0
31    33.0
32    22.0
33    25.0
34    24.0
35    17.0
Name: Infant mortality rate - 2012, dtype: float64
In [95]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
In [96]:
print("X_train:",x_train.shape)
print("X_test:",x_test.shape)
print("Y_train:",y_train.shape)
print("Y_test:",y_test.shape)
X_train: (28, 2)
X_test: (8, 2)
Y_train: (28,)
Y_test: (8,)
In [97]:
model = LinearRegression()
In [98]:
model.fit(x_train, y_train)
Out[98]:
LinearRegression()
In [99]:
model.coef_
Out[99]:
array([0.50520261, 0.05330363])
In [100]:
model.intercept_
Out[100]:
4.1920604574834925
In [101]:
y_pred = model.predict(x_test)
In [102]:
mse_2 = mean_squared_error(y_test, y_pred)
rmse_2 = math.sqrt(mse_2)
mae_2 = mean_absolute_error(y_test, y_pred)
r2_2 = r2_score(y_test, y_pred)
In [103]:
print("MSE --> ", mse_2)
print("RMSE --> ", rmse_2)
print("MAE --> ", mae_2)
print("R2 --> ", r2_2)
MSE -->  41.85230251833887
RMSE -->  6.469335554625287
MAE -->  5.341529882737742
R2 -->  0.8487721679554151
In [104]:
plt.scatter(y_test, y_pred)
plt.xlabel('Actual')
plt.ylabel('Predicted')
Out[104]:
Text(0, 0.5, 'Predicted')
In [105]:
sns.regplot(x=y_test, y=y_pred, ci=None, color ='blue')
Out[105]:
<AxesSubplot:xlabel='Infant mortality rate - 2012'>
In [106]:
metrics = {
    'Model': ['First', 'Second'],
    'MSE' : [mse, mse_2],
    'RMSE' : [rmse, rmse_2],
    'MAE' : [mae, mae_2],
    'R2' : [r2, r2_2]
    }

metrics_data = pd.DataFrame(data=metrics)
In [107]:
metrics_data
Out[107]:
Model MSE RMSE MAE R2
0 First 0.000000 0.000000 0.00000 1.000000
1 Second 41.852303 6.469336 5.34153 0.848772
In [108]:
sns.lmplot(data=rp,x="y_predict",y="residual")
Out[108]:
<seaborn.axisgrid.FacetGrid at 0x293974436a0>
In [ ]:
 
In [ ]:
 
In [ ]: